f1_exome.sample.filtering.Rmd - Remove Contaminated and Low Coverage Samples. - PC1 threshold. - <1% human contamination. - >0.5x coverage.

f2_exome.sample.filtering.Rmd - Remove Related Samples. - Remove first order relatives as estimated using ngsRelate.

f3_exome.sample.filtering.Rmd - Remove Samples Which Do Not Cluster Correctly in Demographic Analyses.** - Analyse PCAngsd and NGSadmix results. - Identify poor quality samples and sample swaps.

f4_exome.sample.filtering.Rmd - Reassign Swapped Samples - Identify samples which have been misslabeled and reassign them to their correct communities

f3_exome.sample.filtering.Rmd - Remove Related Swapped Samples - After reassigning swapped samples, we need to check they are not related to samples in their new (correct) communities. - This is needed because the first stage of removing relatives was ran within each of the original communities.

Setup

rm(list=ls())
knitr::opts_knit$set(root.dir = '~/OneDrive - University College London/Projects/Ostridge_PanAf/sample_filtering/exome/') 
min.cov=0.5

Library

library(dplyr)
library(data.table)
library(ggplot2)
library(readxl)
library(plyr)
library(tidyr)
library(stringr)
library(igraph)
library(RColorBrewer)
library(grid) 
library(gridExtra)
library(fields)
library(networkD3)
library(htmlwidgets)
library(ggrepel)

Read in data

# Read in metadata
f3_exome=read.csv("output/f3/f3.metadata.csv", check.names = FALSE)
## Ensure it is in the correct order
f3_exome=f3_exome[order(f3_exome$Sample),]
# Read in filter table
filter.table=read.csv("output/f3/f3.filtertable.csv", check.names = FALSE)

Plot f3 PCAs

plot.PCA=function(subsps, meta.data, input.dir, output.file=NULL, label=NULL, return.df=FALSE){
  # Create list to store result data frames in
  results=list()
  # BAM file lists are always given in alphabetical order (with respect to sample name) so it is important to ensure the samples are in this order
  meta.data=meta.data[order(meta.data$Sample),]
  # Add label column, if no column is provided a blank label column is used
  meta.data$label=''
  if(!is.null(label)){meta.data$label=meta.data[[label]]}
  # Loop over each subspecies provided
  for(subsp in subsps){
    title=""
    meta.data.subsp=meta.data
    # Subspecies options
    if(subsp=="c"){title="Central"}
    if(subsp=="e"){title="Eastern"}
    if(subsp=="n"){title="Nigeria-Cameroon"}
    if(subsp=="w"){title="Western"}
    if(subsp=="all"){title="All Samples"
      # Group defines how to draw polygons
      group="Subspecies"
      input=paste0(input.dir,"f3.0.5x.", subsp,".cov")}
    if(subsp != "all"){
      # Group defines how to draw polygons
      group="Community"
      input=paste0(input.dir, subsp,"/f3.0.5x.", subsp,".cov")
      if(title!=""){meta.data.subsp=meta.data[meta.data$Subspecies==title,]}}
    # Read in PCAngsd output
    cov=read.table(input)
    pcs=eigen(cov)
    # Select columns corresponding to PC1 and PC2
    pc1.pc2=as.data.frame(pcs$vectors[,1:2])
    colnames(pc1.pc2)=c("f3.PC1", "f3.PC2")
    # Add PC1 and PC2 columns to metadata
    ## The order of samples in PCAngsd output should be alphabetical as the BAM file lists were written in this order
    pc1.pc2=cbind(meta.data.subsp, pc1.pc2)
    # Percentage variance explained by PCs
    PC1.percent=pcs$values[1]/sum(pcs$values)*100
    PC2.percent=pcs$values[2]/sum(pcs$values)*100
    # Plot
    PC1.lab=paste("PC1 (", round(PC1.percent,digits = 2),"%)")
    PC2.lab=paste("PC2 (", round(PC2.percent,digits = 2),"%)")
    find_hull=function(pc1.pc2) pc1.pc2[chull(pc1.pc2[,'f3.PC1'], pc1.pc2[,'f3.PC2']), ]
    hulls=ddply(pc1.pc2, group, find_hull)
    # Plot all samples
    if(subsp=="all"){
      plot=ggplot(pc1.pc2, aes(x=f3.PC1, y=f3.PC2, fill=Subspecies, colour=Subspecies, label = label)) +
        theme_minimal()+ 
        geom_point(aes(shape=Subspecies), size = 2) + 
        geom_text_repel(size = 3, min.segment.length = unit(0.1, "lines"), show.legend = FALSE) +
        geom_polygon(data = hulls, alpha = 0.5) + 
        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), legend.key.size = unit(0.4, "cm"), plot.title = element_text(hjust = 0.5)) +
        guides(fill=guide_legend(ncol=1)) + 
        labs(x=paste(PC1.lab), y=paste(PC2.lab)) + 
        ggtitle(title) +
        scale_fill_manual(values = c('green3', 'darkorange', 'red', 'blue')) +
        scale_color_manual(values = c('green3', 'darkorange', 'red', 'blue')) +
        scale_shape_manual(values=0:4)
    }
    # Plot individual subspecies
    else{
      plot=ggplot(pc1.pc2, aes(x=f3.PC1, y=f3.PC2, fill=Site, colour=Site, label = label)) +
        theme_minimal()+ 
        geom_point(aes(shape=Site), size = 2) + 
        geom_text_repel(size = 3, min.segment.length = unit(0.1, "lines"), show.legend = FALSE) +
        geom_polygon(data = hulls, alpha = 0.5) + 
        theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), legend.key.size = unit(0.4, "cm"), plot.title = element_text(hjust = 0.5)) +
        guides(fill=guide_legend(ncol=1), shape=guide_legend(ncol=1)) + 
        labs(x=paste(PC1.lab), y=paste(PC2.lab)) + 
        ggtitle(title) +
        scale_shape_manual(values=rep(0:6, 10))
    }
    # If an output file is provided, write to it
    if(!is.null(output.file)){ggsave(paste0(output.file), plot=plot)}
    # Display plot
    print(plot)
    # Save metadata with PC1 and PC2 columns added for the subspecies in results list
    results[[subsp]]=pc1.pc2
  }
  # If return.df is set to TRUE, return the results list
  if(return.df==T){return(results)}
}

All Samples

exome_pc1.hc.c.rel_pcs=plot.PCA("all", 
         f3_exome,
         paste0("../../pcangsd/exome/output/f3.0.5x.all_minInd.15_doMajorMinor.1_HWE.p.1e-3_snp.p.1e-6_minMAF.0.01/"),
         return.df=T)

exome_pc1.hc.c.rel_pcs.subsp=plot.PCA(c("c", "w"), 
         f3_exome,
         paste0("../../pcangsd/exome/output/f3.0.5x.subsp_minInd.15_doMajorMinor.1_HWE.p.1e-3_snp.p.1e-6_minMAF.0.01/"),
         return.df=T)

Nothing changes in any PCA apart from th samples which were filtered out are removed. This presumably is because only a small number of samples and if they could not cluster correctly they were likely not providing much information anyway.

f3 NGSadmix

I run NGSadmix on myriad to further investigate samples which looked problematic in the PCA analysis. NGSadmix attempts to form K discrete clusters allowing us to see more explicitly how well samples cluster with their communities. It also helps in the identification of swapped samples and possible migrants.

Scripts are in /home/ucfajos/analysis/phase1and2_exome_analysis/ngsadmix/scripts/ and called run_ngsadmix_all_K2to10_10runs.sh, run_ngsadmix_c_K2to10_10runs.sh etc. NGSadmix is run 10 times for each K and the result with the highest likelihood is plotted.

# Example NGSadmix shell script. This is for all samples (not subspecies specific)
INDIR="../../angsd/exome/output/f3.0.5x.all_minInd.15_doMajorMinor.1_HWE.p.1e-3_snp.p.1e-6_minMAF.0.01_beagle"
# Load modules
module load htslib
# Run NGSadmix
for K in {2..10}
        do
        for RUN in {1..10}
                do
                /usr/bin/time --verbose \
                /home/ucfajos/bin/angsd/misc/NGSadmix \
                -likes ${INDIR}/f3.0.5x.all.beagle.gz \
                -K $K \
                -P 10 \
                -o f3.0.5x.all_K${K}_run${RUN} 
                done
        done

Copy results to local machine.

scp -r myriad:/home/ucfajos/Scratch/output/phase1and2_exome_output/ngsadmix_output/mapped.on.target/\* /Users/harrisonostridge/OneDrive - University College London/Projects/Ostridge_PanAf/ngsadmix/exome/output

Selecting the best runs and K values

These bash scripts pull the log likelihood out of the NGSadmix result and organises the results in a table. This is done in bash rather than R as the log file is not tabular making manipulation in R very difficult.

All

pwd
# bash
INDIR='../../ngsadmix/exome/output/f3.0.5x.all_minInd.15_doMajorMinor.1_HWE.p.1e-3_snp.p.1e-6_minMAF.0.01'
OUTDIR='output/f3/ngsadmix/f3.0.5x.all_minInd.15_doMajorMinor.1_HWE.p.1e-3_snp.p.1e-6_minMAF.0.01'
echo k run log >> ${OUTDIR}/k_logs.TEMP.txt
for K in {2..10}
  do
  for RUN in {1..10}
     do
     # Look for line with 'best' in it, split the string at spaces and take the second element then split it by '-' and take the second element and you have the log likelihood
     LOG=`grep best ${INDIR}/f3.0.5x.all_K${K}_run${RUN}.log | cut -d' ' -f 2 | cut -d'=' -f 2`
     # By saving as a temporary file and then moving it to its final name we stop the file from just continually growing each time we run this chunk
    echo $K $RUN $LOG >> ${OUTDIR}/k_logs.TEMP.txt
  done
done
mv ${OUTDIR}/k_logs.TEMP.txt ${OUTDIR}/k_logs_all.txt
## /Users/harrisonostridge/Library/CloudStorage/OneDrive-UniversityCollegeLondon/Projects/Ostridge_PanAf/sample_filtering/exome
# Read table with log liklihoods of each run into R
k_logs_all=fread('output/f3/ngsadmix/f3.0.5x.all_minInd.15_doMajorMinor.1_HWE.p.1e-3_snp.p.1e-6_minMAF.0.01/k_logs_all.txt')
# Select run with maximum likelihood per K
max_logs_all=k_logs_all %>% group_by(k) %>% slice(which.max(log))

We select the run with the highest log-likelihood i.e. least negative e.g. a run with a log-likelihood of -2 would be selected over one with a log-likelihood of -5. I have manually checked that this is what the code is doing.

Subspecies

pwd
for SUBSP in c w
  do
  # bash
  INDIR='../../ngsadmix/exome/output/f3.0.5x.subsp_minInd.15_doMajorMinor.1_HWE.p.1e-3_snp.p.1e-6_minMAF.0.01'
  OUTDIR='output/f3/ngsadmix/f3.0.5x.subsp_minInd.15_doMajorMinor.1_HWE.p.1e-3_snp.p.1e-6_minMAF.0.01'
  echo k run log >> ${OUTDIR}/${SUBSP}/k_logs.TEMP.txt
  for K in {2..10}
    do
    for RUN in {1..10}
       do
       # Look for line with 'best' in it, split the string at spaces and take the second element then split it by '-' and take the second element and you have the log likelihood
       LOG=`grep best ${INDIR}/${SUBSP}/f3.0.5x.${SUBSP}_K${K}_run${RUN}.log | cut -d' ' -f 2 | cut -d'=' -f 2`
      # By saving as a temporary file and then moving it to its final name we stop the file from just continually growing each time we run this chunk
      echo $K $RUN $LOG >> ${OUTDIR}/${SUBSP}/k_logs.TEMP.txt
    done
  done
  mv ${OUTDIR}/${SUBSP}/k_logs.TEMP.txt ${OUTDIR}/${SUBSP}/${SUBSP}_k_logs.txt
done
## /Users/harrisonostridge/Library/CloudStorage/OneDrive-UniversityCollegeLondon/Projects/Ostridge_PanAf/sample_filtering/exome
# Prepare list to store results in
max_logs_subsp=list()
# For each subspecies...
for(subsp in c('c','w')){
  # Read table with log likelihoods of each run into R
  k_logs=fread(paste0('output/f3/ngsadmix/f3.0.5x.subsp_minInd.15_doMajorMinor.1_HWE.p.1e-3_snp.p.1e-6_minMAF.0.01/', subsp,"/", subsp, "_k_logs.txt"), fill = TRUE)
  # Select run with maximum likelihood per K
  max_logs_subsp[[subsp]]=k_logs %>% group_by(k) %>% slice(which.max(log))
}

Plot f3 NGSadmix Results

# Plotting function
plot.NGSadmix=function(meta.data, max.logs, input.prefix, output.prefix=NULL, title=NULL, Ks=2:10, group="Site"){
  # This function plots the results from NGSadmix as stacked bar charts.
  # Ensure samples are in the correct order (this is the order of the BAM file list input for ANGSD and therefore the order of input for NGAadmix)
  meta.data=meta.data[order(meta.data$Sample),]
  # Loop over K values
  plots=list()
  for(K in Ks){
    # Select run with highest likelihood for this values of K
    run=paste0(max.logs[max.logs$k==K, 'run'])
    # Read in data for the run with the highest likelihood 
    q=read.table(paste0(input.prefix, "_K", K, "_run", run, ".qopt"))
    # I standardise the column names to 'group' rather than 'Site' or 'Subspecies' etc. so it is general
    meta.data$Group=meta.data[,group]
    # cbind samples and group to NGSadmix results
    meta.data.q=cbind(meta.data, q)
    # Change to long format
    df=meta.data.q %>% pivot_longer(colnames(meta.data.q)[(1+ncol(meta.data.q)-K):ncol(meta.data.q)], names_to = "key", values_to = "value")
    # Ordering individuals
    ## Make empty df so each population can be dealt with separately and rbinded together at the end
    df.all=data.frame()
    max.keys=data.frame()
    ## For each population, ensure that samples are ordered according to proportion of ancestry from the major ancestral population for the modern
    for(pop in unlist(unique(df$Group))){
      # Select rows corresponding to each population 
      df.pop=df[df$Group==pop, ]
      # Sum the proportions of each ancestral population
      value.per.key=tapply(df.pop$value, df.pop$key, FUN=sum)
      # Select the ancestral population that has contributed the most to the modern population
      max.key=rownames(data.frame(which.max(value.per.key)))
      # Calculate the proportion of ancestry this ancestral population contributed to modern population
      max.key.prop=value.per.key[which.max(value.per.key)]/length(unique(df.pop$Sample))
      max.keys=rbind(max.keys, cbind(pop, max.key, max.key.prop))
      # Select rows corresponding to this ancestral population and order by proportion
      df.pop.max.key=df.pop[df.pop$key==max.key, ]
      df.pop.max.key=df.pop.max.key[order(df.pop.max.key$value),]
      # Important to have leading 0s on the numbers or the ordering doesn't work
      sample.order=data.frame(cbind(df.pop.max.key$Sample, paste0(sprintf('%0.3d', 1:length(df.pop.max.key$Sample)), pop)))
      colnames(sample.order)=c("Sample", "order")
      df.pop=merge(df.pop, sample.order, by="Sample", all.x=T)
      df.all=rbind(df.all, df.pop)
    }
    # Ordering populations
    df.all=merge(df.all, max.keys, by.x="Group", by.y="pop")
    # Order by the main ancestral population then order populations by the proportion of ancestry contribution
    max.keys=max.keys[order(max.keys$max.key, max.keys$max.key.prop),]
    # Make group type factor so ggplot follows the ordering
    df.all$Group=factor(df.all$Group,levels=unique(max.keys$pop))
    # Plot
    Plot=ggplot(df.all, aes(as.factor(order), value, fill=key)) +
      geom_col(position = "fill", width = 1) + 
      facet_grid(.~Group, space="free", scales="free_x") + 
      theme_minimal() + 
      scale_y_continuous(expand = c(0, 0)) + 
      scale_fill_manual(values = colorRampPalette(brewer.pal(K, "Set1"))(K)) + 
      labs(title=paste0(title, " K=", K), x="", y="") +
      scale_x_discrete(expand = c(0,0)) + 
      theme(plot.title = element_text(hjust = 0.5), legend.position="none", axis.text.x = element_blank(), 
            strip.text.x = element_text(angle=90, hjust=0), panel.background = element_rect(fill = NA, color=NA))
    ## In order to prevent the titles being cropped, I turn the plot into a Grob
    ### I saved the grobs for each K in a list 
    plots[[K]] <- ggplotGrob(Plot)
    ### Select each element beginning with "strip-t" (indicating parameters related to titles at the top (hence "-t"))
    for(i in which(grepl("strip-t", plots[[K]]$layout$name))){plots[[K]]$grobs[[i]]$layout$clip <- "off"}
    # Save as pdf id the option is selected 
    if(!is.null(output.prefix)){ggsave(paste0(output.prefix, "_K", K, ".pdf"), plot=plots[[K]])}
    # Display plot
    grid.arrange(plots[[K]])
  }
}

All

plot.NGSadmix(meta.data=f3_exome, 
              max.logs=max_logs_all, 
              input.prefix='../../ngsadmix/exome/output/f3.0.5x.all_minInd.15_doMajorMinor.1_HWE.p.1e-3_snp.p.1e-6_minMAF.0.01/f3.0.5x.all', 
              title="All Samples", 
              Ks=2:10, 
              group="subsp.or.f2.pca.probem.samples")

Central

plot.NGSadmix(meta.data=f3_exome[f3_exome$Subspecies=="Central",], 
              max.logs=max_logs_subsp[['c']], 
              input.prefix='../../ngsadmix/exome/output/f3.0.5x.subsp_minInd.15_doMajorMinor.1_HWE.p.1e-3_snp.p.1e-6_minMAF.0.01/c/f3.0.5x.c', 
              title="All Samples", 
              Ks=2:10, 
              group="site.or.f2.pca.probem.samples")

Western

plot.NGSadmix(meta.data=f3_exome[f3_exome$Subspecies=="Western",], 
              max.logs=max_logs_subsp[['w']], 
              input.prefix='../../ngsadmix/exome/output/f3.0.5x.subsp_minInd.15_doMajorMinor.1_HWE.p.1e-3_snp.p.1e-6_minMAF.0.01/w/f3.0.5x.w', 
              title="All Samples", 
              Ks=2:10, 
              group="site.or.f2.pca.probem.samples")

Central sample swap

CMNP1-24 is a sample swap and should be from either Conkouati or Loango but can’t tell for sure. I here I run PCAngsd and NGSadmix on only CMNP1-24 and these two populations to assign the sample to the correct community.

f3_exome_c.swap=f3_exome[f3_exome$Sample=="CMNP1-24" | f3_exome$Community=="c.Conkouati" | f3_exome$Community=="c.Loango",]

# PCA
c.swap.pca=plot.PCA("c.swap", 
         meta.data=f3_exome_c.swap,
         input.dir=paste0("../../pcangsd/exome/output/f3.0.5x.c.swap_minInd.5_doMajorMinor.1_HWE.p.1e-3_snp.p.1e-6_minMAF.0.05/"),
         label="Sample",
         return.df=T)

# PC1 vc coverage
ggplot(c.swap.pca[['c.swap']], aes(x=f3.PC1, y=Coverage, label=Sample, colour=Community)) +
    theme_minimal()+ 
    geom_text(size = 3) +
    theme(plot.title = element_text(hjust = 0.5)) +
    labs(x="PC1", y="Coverage") 

# PC1 vs human contamination 
ggplot(c.swap.pca[['c.swap']], aes(x=f3.PC1, y=`Human Contamination (%)`, label=Sample, colour=Community)) +
    theme_minimal()+ 
    geom_text(size = 3) +
    theme(plot.title = element_text(hjust = 0.5)) +
    labs(x="PC1", y="Coverage") 

It is still not possible to distinguish which of the two the sample is from. The three samples with high PC1 values don’t have extreme coverage or contamination so I don’t think PC1 is driven by quality.

pwd
# bash
INDIR='../../ngsadmix/exome/output/f3.0.5x.c.swap_minInd.5_doMajorMinor.1_HWE.p.1e-3_snp.p.1e-6_minMAF.0.05'
OUTDIR='output/f3/ngsadmix/f3.0.5x.c.swap_minInd.5_doMajorMinor.1_HWE.p.1e-3_snp.p.1e-6_minMAF.0.05'
echo k run log >> ${OUTDIR}/k_logs.TEMP.txt
for K in {2..4}
  do
  for RUN in {1..10}
     do
     # Look for line with 'best' in it, split the string at spaces and take the second element then split it by '-' and take the second element and you have the log likelihood
     LOG=`grep best ${INDIR}/f3.0.5x.c.swap_K${K}_run${RUN}.log | cut -d' ' -f 2 | cut -d'=' -f 2`
     # By saving as a temporary file and then moving it to its final name we stop the file from just continually growing each time we run this chunk
    echo $K $RUN $LOG >> ${OUTDIR}/k_logs.TEMP.txt
  done
done
mv ${OUTDIR}/k_logs.TEMP.txt ${OUTDIR}/k_logs_c.swap.txt
## /Users/harrisonostridge/Library/CloudStorage/OneDrive-UniversityCollegeLondon/Projects/Ostridge_PanAf/sample_filtering/exome
# Read table with log liklihoods of each run into R
k_logs_c.swap=fread('output/f3/ngsadmix/f3.0.5x.c.swap_minInd.5_doMajorMinor.1_HWE.p.1e-3_snp.p.1e-6_minMAF.0.05/k_logs_c.swap.txt')
# Select run with maximum likelihood per K
max_logs_c.swap=k_logs_c.swap %>% group_by(k) %>% slice(which.max(log))
plot.NGSadmix(meta.data=f3_exome_c.swap, 
              max.logs=max_logs_c.swap, 
              input.prefix='../../ngsadmix/exome/output/f3.0.5x.c.swap_minInd.5_doMajorMinor.1_HWE.p.1e-3_snp.p.1e-6_minMAF.0.05/f3.0.5x.c.swap', 
              title="", 
              Ks=2:4, 
              group="site.or.f2.pca.probem.samples")

Still cannot confidently assign this sample.

Caludia’s results

Claudia found that CMNP1-24 was from Conkouati using rare alleles. I will use this.

Sort swaps

Reassign CMNP1-24: I cannot tell from my own analysis exactly which community this sample came from howveer Claudia used rare allele techniques on the chr21 and found it was from Conkouati. I will therefore reassign this sample.

Reassign Gco4-2: This sample is clearly from Mt Sangbe.

f4_exome=f3_exome
# Reassign CMNP1-24 to 
f4_exome[f4_exome$Sample=='CMNP1-24', c('Site', 'Country', 'Community', 'Longitude', 'Latitude')]=unique(f4_exome[f4_exome$Community=='c.Conkouati', c('Site', 'Country', 'Community', 'Longitude', 'Latitude')])
# Reassign Gco4-2 to MtSangbe
f4_exome[f4_exome$Sample=='Gco4-2', c('Site', 'Country', 'Community', 'Longitude', 'Latitude')]=unique(f4_exome[f4_exome$Community=='w.MtSangbe', c('Site', 'Country', 'Community', 'Longitude', 'Latitude')])
write.table(f4_exome[,1:20], "output/f4/f4.metadata.csv", sep=",", row.names=FALSE, col.names=TRUE, quote=FALSE)

Write BAM file lists

Only write for the two communities with swapped samples reassigned to them so I can run NGSrelate to check they are not related.

for(com in c('c.Conkouati','w.MtSangbe')){
  f4_exome.com=f4_exome[f4_exome$Community==com, ]
  write.table(f4_exome.com$BAM.mapped.on.target,
              paste0("output/bam.filelists/exome/f4_pc1_hc.1pct_coverage.0.5x_rm.rel_rm.outliers_sort.swaps/bam.filelist.", com),
              row.names=FALSE, col.names=FALSE, quote=FALSE)
}
scp output/bam.filelists/exome/f4_pc1_hc.1pct_coverage.0.5x_rm.rel_rm.outliers_sort.swaps/* myriad:/home/ucfajos/analysis/phase1and2_exome_analysis/angsd/bam.filelists/mapped.on.target/f4_pc1_hc.1pct_coverage.0.5x_rm.rel_rm.outliers_sort.swaps/